1 Introduction and learning objectives

2 Load data

There are two sets of data: (i) training data that contains the actual sale prices, and (ii) out-of-sample data that contains the asking prices. Load both data sets.

Make sure you understand what information each column contains. Note that not all information provided might be useful in predicting house prices, but do not make any assumptions before you decide what information you use in your prediction algorithms.

# read in the data

london_house_prices_2019_training <- read.csv(
  "training_data_assignment_with_prices.csv"
)
london_house_prices_2019_out_of_sample <- read.csv(
  "test_data_assignment.csv"
)

# fix data types in both data sets

# Apply the same type clean-up to a data frame:
#   1. parse the `date` column as a Date,
#   2. convert every remaining character column to a factor.
# The date is converted first so that it is not turned into a factor.
fix_column_types <- function(df) {
  df %>%
    mutate(date = as.Date(date)) %>%
    mutate(across(where(is.character), as.factor))
}

london_house_prices_2019_training <-
  fix_column_types(london_house_prices_2019_training)
london_house_prices_2019_out_of_sample <-
  fix_column_types(london_house_prices_2019_out_of_sample)

# take a quick look at what's in the data
str(london_house_prices_2019_training)
## 'data.frame':    13998 obs. of  37 variables:
##  $ ID                          : int  2 3 4 5 7 8 9 10 11 12 ...
##  $ date                        : Date, format: "2019-11-01" "2019-08-08" ...
##  $ postcode                    : Factor w/ 12635 levels "BR1 1AB","BR1 1LR",..: 10897 11027 11264 2031 11241 11066 421 9594 9444 873 ...
##  $ property_type               : Factor w/ 4 levels "D","F","S","T": 2 2 3 2 3 2 1 4 4 2 ...
##  $ whether_old_or_new          : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ freehold_or_leasehold       : Factor w/ 2 levels "F","L": 2 2 1 2 1 2 1 1 1 2 ...
##  $ address1                    : Factor w/ 2825 levels "1","1 - 2","1 - 3",..: 2503 792 253 789 569 234 264 418 5 274 ...
##  $ address2                    : Factor w/ 434 levels "1","10","101",..: 372 NA NA NA NA NA NA NA NA NA ...
##  $ address3                    : Factor w/ 8543 levels "ABBERTON WALK",..: 6990 6821 3715 2492 4168 2879 3620 5251 6045 6892 ...
##  $ town                        : Factor w/ 133 levels "ABBEY WOOD","ACTON",..: NA NA NA 78 NA NA NA NA NA NA ...
##  $ local_aut                   : Factor w/ 69 levels "ASHFORD","BARKING",..: 36 46 24 36 24 46 65 36 36 17 ...
##  $ county                      : Factor w/ 33 levels "BARKING AND DAGENHAM",..: 22 27 18 25 18 27 5 27 32 8 ...
##  $ postcode_short              : Factor w/ 247 levels "BR1","BR2","BR3",..: 190 194 198 28 198 194 4 169 167 8 ...
##  $ current_energy_rating       : Factor w/ 6 levels "B","C","D","E",..: 4 3 3 4 3 2 4 3 4 2 ...
##  $ total_floor_area            : num  30 50 100 39 88 101 136 148 186 65 ...
##  $ number_habitable_rooms      : int  2 2 5 2 4 4 6 6 6 3 ...
##  $ co2_emissions_current       : num  2.3 3 3.7 2.8 3.9 3.1 8.1 5.6 10 1.5 ...
##  $ co2_emissions_potential     : num  1.7 1.7 1.5 1.1 1.4 1.4 4.1 2 6.1 1.5 ...
##  $ energy_consumption_current  : int  463 313 212 374 251 175 339 216 308 128 ...
##  $ energy_consumption_potential: int  344 175 82 144 90 77 168 75 186 128 ...
##  $ windows_energy_eff          : Factor w/ 5 levels "Average","Good",..: 1 1 1 5 1 1 1 1 5 1 ...
##  $ tenure                      : Factor w/ 3 levels "owner-occupied",..: 1 2 1 2 1 1 1 2 1 1 ...
##  $ latitude                    : num  51.5 51.5 51.5 51.6 51.5 ...
##  $ longitude                   : num  -0.1229 -0.2828 -0.4315 0.0423 -0.4293 ...
##  $ population                  : int  34 75 83 211 73 51 25 91 60 97 ...
##  $ altitude                    : int  8 9 25 11 21 11 95 7 7 106 ...
##  $ london_zone                 : int  1 3 5 3 6 6 3 2 2 3 ...
##  $ nearest_station             : Factor w/ 592 levels "abbey road","abbey wood",..: 478 358 235 319 180 502 566 30 32 566 ...
##  $ water_company               : Factor w/ 5 levels "Affinity Water",..: 5 5 1 5 1 5 5 5 5 5 ...
##  $ average_income              : int  57200 61900 50600 45400 49000 56200 57200 65600 50400 52300 ...
##  $ district                    : Factor w/ 33 levels "Barking and Dagenham",..: 22 27 18 26 18 27 5 27 32 8 ...
##  $ price                       : num  360000 408500 499950 259999 395000 ...
##  $ type_of_closest_station     : Factor w/ 3 levels "light_rail","rail",..: 3 2 3 1 3 2 1 3 1 1 ...
##  $ num_tube_lines              : int  1 0 1 0 1 0 0 2 0 0 ...
##  $ num_rail_lines              : int  0 1 1 0 1 1 0 0 1 0 ...
##  $ num_light_rail_lines        : int  0 0 0 1 0 0 1 0 1 1 ...
##  $ distance_to_station         : num  0.528 0.77 0.853 0.29 1.073 ...
# same structural overview for the out-of-sample data
str(london_house_prices_2019_out_of_sample)
## 'data.frame':    1999 obs. of  37 variables:
##  $ ID                          : int  14434 12562 8866 10721 1057 1527 13961 12108 9363 1155 ...
##  $ date                        : Date, format: NA NA ...
##  $ postcode                    : logi  NA NA NA NA NA NA ...
##  $ property_type               : Factor w/ 4 levels "D","F","S","T": 1 2 2 3 4 3 2 3 2 4 ...
##  $ whether_old_or_new          : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ freehold_or_leasehold       : Factor w/ 2 levels "F","L": 1 2 2 1 1 1 2 1 2 1 ...
##  $ address1                    : logi  NA NA NA NA NA NA ...
##  $ address2                    : logi  NA NA NA NA NA NA ...
##  $ address3                    : logi  NA NA NA NA NA NA ...
##  $ town                        : Factor w/ 54 levels "ACTON","ADDISCOMBE",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ local_aut                   : logi  NA NA NA NA NA NA ...
##  $ county                      : logi  NA NA NA NA NA NA ...
##  $ postcode_short              : Factor w/ 221 levels "BR1","BR2","BR3",..: 82 50 37 52 214 150 159 115 175 126 ...
##  $ current_energy_rating       : Factor w/ 6 levels "B","C","D","E",..: 3 2 3 3 4 4 4 3 4 3 ...
##  $ total_floor_area            : num  150 59 58 74 97.3 ...
##  $ number_habitable_rooms      : int  6 2 2 5 5 5 5 4 2 5 ...
##  $ co2_emissions_current       : num  7.3 1.5 2.8 3.5 6.5 4.9 5.1 2.9 4.2 4.3 ...
##  $ co2_emissions_potential     : num  2.4 1.4 1.2 1.2 5.7 1.6 3 0.8 3.2 2.5 ...
##  $ energy_consumption_current  : int  274 142 253 256 303 309 240 224 458 253 ...
##  $ energy_consumption_potential: int  89 136 110 80 266 101 140 58 357 143 ...
##  $ windows_energy_eff          : Factor w/ 5 levels "Average","Good",..: 1 1 1 1 1 1 3 1 3 1 ...
##  $ tenure                      : Factor w/ 3 levels "owner-occupied",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ latitude                    : num  51.6 51.6 51.5 51.6 51.5 ...
##  $ longitude                   : num  -0.129 -0.2966 -0.0328 -0.3744 -0.2576 ...
##  $ population                  : int  87 79 23 73 100 24 22 49 65 98 ...
##  $ altitude                    : int  63 38 17 39 8 46 26 16 14 18 ...
##  $ london_zone                 : int  4 4 2 5 2 4 3 6 1 3 ...
##  $ nearest_station             : Factor w/ 494 levels "abbey wood","acton central",..: 16 454 181 302 431 142 20 434 122 212 ...
##  $ water_company               : Factor w/ 4 levels "Affinity Water",..: 4 1 4 1 4 4 4 2 4 4 ...
##  $ average_income              : int  61300 48900 46200 52200 60700 59600 64000 48100 56600 53500 ...
##  $ district                    : Factor w/ 32 levels "Barking and Dagenham",..: 9 4 29 14 17 10 31 15 19 22 ...
##  $ type_of_closest_station     : Factor w/ 3 levels "light_rail","rail",..: 3 3 1 2 3 2 3 3 3 2 ...
##  $ num_tube_lines              : int  1 2 0 0 2 0 1 1 2 0 ...
##  $ num_rail_lines              : int  0 1 0 1 0 1 1 0 0 1 ...
##  $ num_light_rail_lines        : int  0 1 1 0 0 0 0 1 0 0 ...
##  $ distance_to_station         : num  0.839 0.104 0.914 0.766 0.449 ...
##  $ asking_price                : num  750000 229000 152000 379000 930000 350000 688000 386000 534000 459000 ...
# fix the RNG seed before drawing the random split
set.seed(2999)

# initial split: 75% of the priced data goes to the training partition
train_test_split <- initial_split(
  london_house_prices_2019_training,
  prop = 0.75
)

# extract the two partitions of the split
train_data <- training(train_test_split)
test_data <- testing(train_test_split)

3 Visualize data

Visualize and examine the data. What plots could be useful here? What do you learn from these visualizations?

# median sale price for each date in the training data
plotData <- summarise(
  group_by(london_house_prices_2019_training, date),
  medianPrice = median(price)
)

# daily medians drawn as a line with points, plus a smoothed trend
ggplot(data = plotData, mapping = aes(x = date, y = medianPrice)) +
  geom_line(colour = "steelblue") +
  geom_point() +
  geom_smooth() +
  labs(x = "Date", y = "Daily Median Sales Price in GBP") +
  scale_y_continuous(labels = number) +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

There seems to be slight seasonality in daily median house prices, with prices being higher in the summer. However, this increase does not seem too drastic.

# Kernel density estimate of sale prices in the training data.
# The y axis is a density, not a share of observations, so it is labelled
# "Density" and left on its natural scale instead of being formatted as a
# percentage.
ggplot(london_house_prices_2019_training, aes(x = price)) +
  geom_density(color = "steelblue") +
  scale_x_continuous(labels = number) +
  theme_bw() +
  labs(x = "Price in GBP", y = "Density")

House prices are heavily skewed to the right. This indicates that there are large outlying values. We should assess the impact of this on our models.

Estimate a correlation table between prices and other continuous variables. What do you glean from the correlation table?

The correlation table gives a first idea of potential relationships in the data, as well as of potential drivers of our dependent variable, price.

# produce a correlation table using GGally::ggcorr()
# this takes a while to plot
london_house_prices_2019_training %>%
  # ggcorr() only correlates numeric columns and warns about any others it
  # has to drop, so keep the numeric columns explicitly; ID is a row
  # identifier, not a feature, and is removed as well
  select(where(is.numeric), -ID) %>%
  ggcorr(
    method = c("pairwise", "pearson"),
    layout.exp = 2,
    label_round = 2,
    label = TRUE,
    label_size = 2,
    hjust = 1,
    nbreaks = 5,
    size = 2
  ) +
  theme_bw()

4 Fit a linear regression model

# Define control variables: 5-fold cross-validation for resampling.
lrControl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE # FALSE: do not report progress after each estimation
)

# Train the linear model and report the results using k-fold cross validation.
# `a * b` expands to a + b + a:b, so each interaction below also contributes
# both main effects. Every predictor is listed exactly once.
model1_lm <- train(
  price ~ district * total_floor_area +
    london_zone * number_habitable_rooms +
    co2_emissions_potential +
    distance_to_station +
    water_company +
    property_type +
    latitude +
    longitude +
    average_income,
  train_data,
  method = "lm",
  trControl = lrControl
)

# summary of the results
summary(model1_lm)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3293917   -70924     -958    65630  4420578 
## 
## Coefficients:
##                                                     Estimate Std. Error t value
## (Intercept)                                       -7.974e+06  5.882e+06  -1.356
## districtBarnet                                    -1.185e+05  6.256e+04  -1.895
## districtBexley                                    -1.044e+05  6.336e+04  -1.647
## districtBrent                                     -1.893e+05  6.649e+04  -2.847
## districtBromley                                   -1.094e+05  6.160e+04  -1.776
## districtCamden                                    -3.438e+05  6.364e+04  -5.402
## `districtCity of London`                          -5.747e+05  1.936e+05  -2.968
## districtCroydon                                   -3.427e+04  6.245e+04  -0.549
## districtEaling                                    -8.515e+04  6.720e+04  -1.267
## districtEnfield                                   -2.595e+04  6.315e+04  -0.411
## districtGreenwich                                 -1.774e+05  6.261e+04  -2.833
## districtHackney                                   -1.823e+05  6.470e+04  -2.817
## `districtHammersmith and Fulham`                  -3.306e+05  6.838e+04  -4.835
## districtHaringey                                  -2.025e+05  6.711e+04  -3.018
## districtHarrow                                     5.428e+04  7.279e+04   0.746
## districtHavering                                   9.842e+03  5.891e+04   0.167
## districtHillingdon                                 1.055e+05  7.121e+04   1.481
## districtHounslow                                  -1.696e+05  7.149e+04  -2.373
## districtIslington                                 -3.711e+05  6.736e+04  -5.509
## `districtKensington and Chelsea`                  -6.229e+05  6.615e+04  -9.417
## `districtKingston upon Thames`                    -1.023e+05  6.656e+04  -1.536
## districtLambeth                                   -2.391e+05  6.483e+04  -3.688
## districtLewisham                                  -1.165e+05  6.362e+04  -1.830
## districtMerton                                    -3.036e+05  6.563e+04  -4.625
## districtNewham                                    -1.133e+05  7.367e+04  -1.538
## districtRedbridge                                 -1.162e+05  5.829e+04  -1.994
## `districtRichmond upon Thames`                    -2.257e+05  6.689e+04  -3.375
## districtSouthwark                                 -2.366e+05  6.372e+04  -3.712
## districtSutton                                    -5.012e+04  6.734e+04  -0.744
## `districtTower Hamlets`                           -2.664e+05  6.970e+04  -3.822
## `districtWaltham Forest`                          -5.270e+04  6.605e+04  -0.798
## districtWandsworth                                -2.214e+05  6.413e+04  -3.452
## districtWestminster                               -8.278e+05  6.732e+04 -12.296
## total_floor_area                                   2.365e+03  6.286e+02   3.762
## london_zone                                       -2.927e+04  6.179e+03  -4.736
## number_habitable_rooms                             9.204e+03  5.463e+03   1.685
## co2_emissions_potential                            2.004e+04  1.952e+03  10.265
## distance_to_station                               -1.463e+04  5.992e+03  -2.442
## `water_companyEssex & Suffolk Water`              -1.002e+04  2.684e+04  -0.373
## `water_companyLeep Utilities`                      8.071e+04  1.242e+05   0.650
## `water_companySES Water`                           6.685e+04  2.231e+04   2.996
## `water_companyThames Water`                        4.638e+04  1.484e+04   3.126
## property_typeF                                    -2.224e+05  1.229e+04 -18.090
## property_typeS                                    -1.121e+05  1.070e+04 -10.474
## property_typeT                                    -1.379e+05  1.102e+04 -12.512
## latitude                                           1.572e+05  1.139e+05   1.380
## longitude                                          2.996e+05  6.928e+04   4.324
## average_income                                     7.198e+00  3.177e-01  22.655
## `districtBarnet:total_floor_area`                  2.955e+03  6.456e+02   4.577
## `districtBexley:total_floor_area`                  6.191e+02  6.924e+02   0.894
## `districtBrent:total_floor_area`                   4.413e+03  6.779e+02   6.509
## `districtBromley:total_floor_area`                 1.196e+03  6.518e+02   1.835
## `districtCamden:total_floor_area`                  7.377e+03  6.630e+02  11.125
## `districtCity of London:total_floor_area`          1.383e+04  3.144e+03   4.398
## `districtCroydon:total_floor_area`                 3.160e+02  6.492e+02   0.487
## `districtEaling:total_floor_area`                  2.870e+03  6.712e+02   4.276
## `districtEnfield:total_floor_area`                 1.013e+03  6.764e+02   1.497
## `districtGreenwich:total_floor_area`               1.711e+03  6.790e+02   2.520
## `districtHackney:total_floor_area`                 3.889e+03  7.008e+02   5.549
## `districtHammersmith and Fulham:total_floor_area`  7.547e+03  6.742e+02  11.194
## `districtHaringey:total_floor_area`                3.790e+03  7.079e+02   5.354
## `districtHarrow:total_floor_area`                  1.522e+03  7.066e+02   2.154
## `districtHavering:total_floor_area`                3.929e+02  6.875e+02   0.571
## `districtHillingdon:total_floor_area`              1.448e+03  6.946e+02   2.085
## `districtHounslow:total_floor_area`                4.411e+03  7.071e+02   6.237
## `districtIslington:total_floor_area`               7.147e+03  7.230e+02   9.885
## `districtKensington and Chelsea:total_floor_area`  1.648e+04  6.619e+02  24.899
## `districtKingston upon Thames:total_floor_area`    3.190e+03  6.569e+02   4.856
## `districtLambeth:total_floor_area`                 4.166e+03  6.783e+02   6.142
## `districtLewisham:total_floor_area`                1.122e+03  6.842e+02   1.640
## `districtMerton:total_floor_area`                  4.557e+03  6.725e+02   6.776
## `districtNewham:total_floor_area`                  5.436e+02  8.322e+02   0.653
## `districtRedbridge:total_floor_area`               1.099e+03  6.658e+02   1.650
## `districtRichmond upon Thames:total_floor_area`    5.570e+03  6.508e+02   8.559
## `districtSouthwark:total_floor_area`               4.162e+03  6.857e+02   6.070
## `districtSutton:total_floor_area`                  8.412e+02  6.804e+02   1.236
## `districtTower Hamlets:total_floor_area`           3.459e+03  7.908e+02   4.374
## `districtWaltham Forest:total_floor_area`          1.031e+03  7.426e+02   1.388
## `districtWandsworth:total_floor_area`              4.373e+03  6.600e+02   6.625
## `districtWestminster:total_floor_area`             1.659e+04  6.876e+02  24.120
## `london_zone:number_habitable_rooms`              -5.964e+03  1.248e+03  -4.778
##                                                   Pr(>|t|)    
## (Intercept)                                       0.175230    
## districtBarnet                                    0.058183 .  
## districtBexley                                    0.099549 .  
## districtBrent                                     0.004418 ** 
## districtBromley                                   0.075719 .  
## districtCamden                                    6.74e-08 ***
## `districtCity of London`                          0.003004 ** 
## districtCroydon                                   0.583127    
## districtEaling                                    0.205171    
## districtEnfield                                   0.681090    
## districtGreenwich                                 0.004613 ** 
## districtHackney                                   0.004850 ** 
## `districtHammersmith and Fulham`                  1.35e-06 ***
## districtHaringey                                  0.002553 ** 
## districtHarrow                                    0.455872    
## districtHavering                                  0.867317    
## districtHillingdon                                0.138609    
## districtHounslow                                  0.017668 *  
## districtIslington                                 3.70e-08 ***
## `districtKensington and Chelsea`                   < 2e-16 ***
## `districtKingston upon Thames`                    0.124448    
## districtLambeth                                   0.000227 ***
## districtLewisham                                  0.067204 .  
## districtMerton                                    3.79e-06 ***
## districtNewham                                    0.124027    
## districtRedbridge                                 0.046214 *  
## `districtRichmond upon Thames`                    0.000742 ***
## districtSouthwark                                 0.000206 ***
## districtSutton                                    0.456776    
## `districtTower Hamlets`                           0.000133 ***
## `districtWaltham Forest`                          0.424926    
## districtWandsworth                                0.000560 ***
## districtWestminster                                < 2e-16 ***
## total_floor_area                                  0.000170 ***
## london_zone                                       2.21e-06 ***
## number_habitable_rooms                            0.092040 .  
## co2_emissions_potential                            < 2e-16 ***
## distance_to_station                               0.014605 *  
## `water_companyEssex & Suffolk Water`              0.708879    
## `water_companyLeep Utilities`                     0.515841    
## `water_companySES Water`                          0.002738 ** 
## `water_companyThames Water`                       0.001777 ** 
## property_typeF                                     < 2e-16 ***
## property_typeS                                     < 2e-16 ***
## property_typeT                                     < 2e-16 ***
## latitude                                          0.167614    
## longitude                                         1.55e-05 ***
## average_income                                     < 2e-16 ***
## `districtBarnet:total_floor_area`                 4.77e-06 ***
## `districtBexley:total_floor_area`                 0.371272    
## `districtBrent:total_floor_area`                  7.89e-11 ***
## `districtBromley:total_floor_area`                0.066463 .  
## `districtCamden:total_floor_area`                  < 2e-16 ***
## `districtCity of London:total_floor_area`         1.10e-05 ***
## `districtCroydon:total_floor_area`                0.626391    
## `districtEaling:total_floor_area`                 1.92e-05 ***
## `districtEnfield:total_floor_area`                0.134354    
## `districtGreenwich:total_floor_area`              0.011760 *  
## `districtHackney:total_floor_area`                2.94e-08 ***
## `districtHammersmith and Fulham:total_floor_area`  < 2e-16 ***
## `districtHaringey:total_floor_area`               8.79e-08 ***
## `districtHarrow:total_floor_area`                 0.031279 *  
## `districtHavering:total_floor_area`               0.567675    
## `districtHillingdon:total_floor_area`             0.037119 *  
## `districtHounslow:total_floor_area`               4.62e-10 ***
## `districtIslington:total_floor_area`               < 2e-16 ***
## `districtKensington and Chelsea:total_floor_area`  < 2e-16 ***
## `districtKingston upon Thames:total_floor_area`   1.21e-06 ***
## `districtLambeth:total_floor_area`                8.47e-10 ***
## `districtLewisham:total_floor_area`               0.100940    
## `districtMerton:total_floor_area`                 1.31e-11 ***
## `districtNewham:total_floor_area`                 0.513655    
## `districtRedbridge:total_floor_area`              0.098907 .  
## `districtRichmond upon Thames:total_floor_area`    < 2e-16 ***
## `districtSouthwark:total_floor_area`              1.32e-09 ***
## `districtSutton:total_floor_area`                 0.216370    
## `districtTower Hamlets:total_floor_area`          1.23e-05 ***
## `districtWaltham Forest:total_floor_area`         0.165203    
## `districtWandsworth:total_floor_area`             3.64e-11 ***
## `districtWestminster:total_floor_area`             < 2e-16 ***
## `london_zone:number_habitable_rooms`              1.79e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 213000 on 10417 degrees of freedom
## Multiple R-squared:  0.8286, Adjusted R-squared:  0.8273 
## F-statistic: 629.4 on 80 and 10417 DF,  p-value: < 2.2e-16

We have also seen how there are large outlying values in our response price variable.

# Diagnostic plots for the untransformed price variable.
# Every predictor is listed exactly once in the formula.
plot(lm(
  price ~ district * total_floor_area +
    london_zone * number_habitable_rooms +
    co2_emissions_potential +
    distance_to_station +
    water_company +
    property_type +
    latitude +
    longitude +
    average_income,
  data = train_data
))

# Diagnostic plots for the log-transformed price variable (same predictors).
plot(lm(
  log(price) ~ district * total_floor_area +
    london_zone * number_habitable_rooms +
    co2_emissions_potential +
    distance_to_station +
    water_company +
    property_type +
    latitude +
    longitude +
    average_income,
  data = train_data
))

In the plots above, we can see how the residuals at the extreme quantiles diverge from normality. Let us therefore see how the performance of our linear regression changes when we apply a log transformation to price.

# Build the linear regression on the log-transformed price.
# The outcome is log(price), so the fitted values and the cross-validated
# error metrics of this model are on the log scale.
# Every predictor is listed exactly once in the formula.
model1_lm_log <- train(
  log(price) ~ district * total_floor_area +
    london_zone * number_habitable_rooms +
    co2_emissions_potential +
    distance_to_station +
    water_company +
    property_type +
    latitude +
    longitude +
    average_income,
  train_data,
  method = "lm",
  trControl = lrControl
)

# summary of the results
summary(model1_lm_log)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52621 -0.11739  0.00723  0.13613  1.27259 
## 
## Coefficients:
##                                                     Estimate Std. Error t value
## (Intercept)                                       -9.068e+00  6.487e+00  -1.398
## districtBarnet                                     2.459e-01  6.900e-02   3.563
## districtBexley                                    -5.664e-02  6.987e-02  -0.811
## districtBrent                                      1.237e-01  7.332e-02   1.687
## districtBromley                                    1.318e-02  6.793e-02   0.194
## districtCamden                                     3.782e-01  7.018e-02   5.389
## `districtCity of London`                          -1.969e-01  2.135e-01  -0.922
## districtCroydon                                    1.025e-01  6.887e-02   1.489
## districtEaling                                     1.771e-01  7.411e-02   2.390
## districtEnfield                                    1.628e-01  6.964e-02   2.337
## districtGreenwich                                 -1.496e-02  6.905e-02  -0.217
## districtHackney                                    2.014e-01  7.136e-02   2.823
## `districtHammersmith and Fulham`                   3.366e-01  7.541e-02   4.463
## districtHaringey                                   1.685e-01  7.401e-02   2.277
## districtHarrow                                     2.451e-01  8.027e-02   3.053
## districtHavering                                   1.940e-01  6.497e-02   2.987
## districtHillingdon                                 2.477e-01  7.853e-02   3.154
## districtHounslow                                   7.712e-02  7.884e-02   0.978
## districtIslington                                  1.046e-01  7.428e-02   1.408
## `districtKensington and Chelsea`                   5.616e-01  7.295e-02   7.698
## `districtKingston upon Thames`                     2.149e-01  7.341e-02   2.928
## districtLambeth                                    2.521e-01  7.149e-02   3.526
## districtLewisham                                  -5.475e-03  7.016e-02  -0.078
## districtMerton                                     9.987e-03  7.238e-02   0.138
## districtNewham                                    -1.068e-01  8.124e-02  -1.314
## districtRedbridge                                  4.255e-02  6.428e-02   0.662
## `districtRichmond upon Thames`                     3.013e-01  7.377e-02   4.084
## districtSouthwark                                  1.499e-01  7.028e-02   2.133
## districtSutton                                     8.435e-03  7.427e-02   0.114
## `districtTower Hamlets`                           -1.114e-01  7.686e-02  -1.449
## `districtWaltham Forest`                           1.615e-01  7.284e-02   2.217
## districtWandsworth                                 1.924e-01  7.072e-02   2.720
## districtWestminster                                3.461e-01  7.424e-02   4.662
## total_floor_area                                   5.157e-03  6.932e-04   7.439
## london_zone                                       -8.825e-02  6.815e-03 -12.950
## number_habitable_rooms                             2.304e-02  6.024e-03   3.824
## co2_emissions_potential                            6.681e-03  2.152e-03   3.104
## distance_to_station                               -3.387e-02  6.608e-03  -5.126
## `water_companyEssex & Suffolk Water`               1.867e-02  2.960e-02   0.631
## `water_companyLeep Utilities`                      2.586e-01  1.370e-01   1.888
## `water_companySES Water`                           1.516e-01  2.460e-02   6.164
## `water_companyThames Water`                        1.413e-01  1.636e-02   8.639
## property_typeF                                    -4.116e-01  1.356e-02 -30.361
## property_typeS                                    -9.513e-02  1.180e-02  -8.058
## property_typeT                                    -1.531e-01  1.215e-02 -12.593
## latitude                                           4.111e-01  1.257e-01   3.272
## longitude                                          8.379e-02  7.641e-02   1.097
## average_income                                     1.319e-05  3.504e-07  37.645
## `districtBarnet:total_floor_area`                 -2.981e-04  7.120e-04  -0.419
## `districtBexley:total_floor_area`                 -2.817e-04  7.636e-04  -0.369
## `districtBrent:total_floor_area`                   1.264e-03  7.476e-04   1.690
## `districtBromley:total_floor_area`                 5.831e-06  7.188e-04   0.008
## `districtCamden:total_floor_area`                  5.293e-04  7.312e-04   0.724
## `districtCity of London:total_floor_area`          1.085e-02  3.468e-03   3.128
## `districtCroydon:total_floor_area`                -1.393e-03  7.159e-04  -1.945
## `districtEaling:total_floor_area`                 -1.777e-04  7.402e-04  -0.240
## `districtEnfield:total_floor_area`                -1.200e-03  7.459e-04  -1.609
## `districtGreenwich:total_floor_area`              -2.338e-05  7.488e-04  -0.031
## `districtHackney:total_floor_area`                 9.761e-04  7.729e-04   1.263
## `districtHammersmith and Fulham:total_floor_area`  1.121e-03  7.435e-04   1.507
## `districtHaringey:total_floor_area`                3.242e-04  7.807e-04   0.415
## `districtHarrow:total_floor_area`                 -2.923e-04  7.792e-04  -0.375
## `districtHavering:total_floor_area`               -9.168e-04  7.582e-04  -1.209
## `districtHillingdon:total_floor_area`             -3.094e-04  7.661e-04  -0.404
## `districtHounslow:total_floor_area`                1.183e-03  7.798e-04   1.517
## `districtIslington:total_floor_area`               2.807e-03  7.974e-04   3.521
## `districtKensington and Chelsea:total_floor_area`  2.408e-03  7.299e-04   3.299
## `districtKingston upon Thames:total_floor_area`   -1.472e-04  7.244e-04  -0.203
## `districtLambeth:total_floor_area`                -3.347e-04  7.481e-04  -0.447
## `districtLewisham:total_floor_area`                1.786e-04  7.546e-04   0.237
## `districtMerton:total_floor_area`                  7.302e-04  7.416e-04   0.985
## `districtNewham:total_floor_area`                 -4.339e-05  9.178e-04  -0.047
## `districtRedbridge:total_floor_area`              -5.857e-04  7.342e-04  -0.798
## `districtRichmond upon Thames:total_floor_area`    2.246e-04  7.177e-04   0.313
## `districtSouthwark:total_floor_area`               9.492e-04  7.562e-04   1.255
## `districtSutton:total_floor_area`                 -4.311e-04  7.503e-04  -0.575
## `districtTower Hamlets:total_floor_area`           2.900e-03  8.721e-04   3.326
## `districtWaltham Forest:total_floor_area`         -8.867e-04  8.190e-04  -1.083
## `districtWandsworth:total_floor_area`              6.087e-04  7.278e-04   0.836
## `districtWestminster:total_floor_area`             3.761e-03  7.583e-04   4.960
## `london_zone:number_habitable_rooms`               1.538e-03  1.376e-03   1.117
##                                                   Pr(>|t|)    
## (Intercept)                                       0.162161    
## districtBarnet                                    0.000368 ***
## districtBexley                                    0.417633    
## districtBrent                                     0.091690 .  
## districtBromley                                   0.846194    
## districtCamden                                    7.26e-08 ***
## `districtCity of London`                          0.356419    
## districtCroydon                                   0.136503    
## districtEaling                                    0.016879 *  
## districtEnfield                                   0.019442 *  
## districtGreenwich                                 0.828517    
## districtHackney                                   0.004766 ** 
## `districtHammersmith and Fulham`                  8.16e-06 ***
## districtHaringey                                  0.022819 *  
## districtHarrow                                    0.002274 ** 
## districtHavering                                  0.002826 ** 
## districtHillingdon                                0.001614 ** 
## districtHounslow                                  0.328011    
## districtIslington                                 0.159177    
## `districtKensington and Chelsea`                  1.51e-14 ***
## `districtKingston upon Thames`                    0.003417 ** 
## districtLambeth                                   0.000424 ***
## districtLewisham                                  0.937803    
## districtMerton                                    0.890260    
## districtNewham                                    0.188763    
## districtRedbridge                                 0.508002    
## `districtRichmond upon Thames`                    4.45e-05 ***
## districtSouthwark                                 0.032973 *  
## districtSutton                                    0.909579    
## `districtTower Hamlets`                           0.147291    
## `districtWaltham Forest`                          0.026625 *  
## districtWandsworth                                0.006536 ** 
## districtWestminster                               3.17e-06 ***
## total_floor_area                                  1.09e-13 ***
## london_zone                                        < 2e-16 ***
## number_habitable_rooms                            0.000132 ***
## co2_emissions_potential                           0.001914 ** 
## distance_to_station                               3.01e-07 ***
## `water_companyEssex & Suffolk Water`              0.528136    
## `water_companyLeep Utilities`                     0.059032 .  
## `water_companySES Water`                          7.38e-10 ***
## `water_companyThames Water`                        < 2e-16 ***
## property_typeF                                     < 2e-16 ***
## property_typeS                                    8.58e-16 ***
## property_typeT                                     < 2e-16 ***
## latitude                                          0.001073 ** 
## longitude                                         0.272798    
## average_income                                     < 2e-16 ***
## `districtBarnet:total_floor_area`                 0.675510    
## `districtBexley:total_floor_area`                 0.712157    
## `districtBrent:total_floor_area`                  0.091051 .  
## `districtBromley:total_floor_area`                0.993527    
## `districtCamden:total_floor_area`                 0.469180    
## `districtCity of London:total_floor_area`         0.001762 ** 
## `districtCroydon:total_floor_area`                0.051767 .  
## `districtEaling:total_floor_area`                 0.810323    
## `districtEnfield:total_floor_area`                0.107730    
## `districtGreenwich:total_floor_area`              0.975087    
## `districtHackney:total_floor_area`                0.206630    
## `districtHammersmith and Fulham:total_floor_area` 0.131819    
## `districtHaringey:total_floor_area`               0.677927    
## `districtHarrow:total_floor_area`                 0.707549    
## `districtHavering:total_floor_area`               0.226595    
## `districtHillingdon:total_floor_area`             0.686321    
## `districtHounslow:total_floor_area`               0.129343    
## `districtIslington:total_floor_area`              0.000432 ***
## `districtKensington and Chelsea:total_floor_area` 0.000974 ***
## `districtKingston upon Thames:total_floor_area`   0.838982    
## `districtLambeth:total_floor_area`                0.654574    
## `districtLewisham:total_floor_area`               0.812904    
## `districtMerton:total_floor_area`                 0.324866    
## `districtNewham:total_floor_area`                 0.962294    
## `districtRedbridge:total_floor_area`              0.425047    
## `districtRichmond upon Thames:total_floor_area`   0.754290    
## `districtSouthwark:total_floor_area`              0.209414    
## `districtSutton:total_floor_area`                 0.565597    
## `districtTower Hamlets:total_floor_area`          0.000885 ***
## `districtWaltham Forest:total_floor_area`         0.278979    
## `districtWandsworth:total_floor_area`             0.402972    
## `districtWestminster:total_floor_area`            7.17e-07 ***
## `london_zone:number_habitable_rooms`              0.264011    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2349 on 10417 degrees of freedom
## Multiple R-squared:  0.8164, Adjusted R-squared:  0.815 
## F-statistic: 579.1 on 80 and 10417 DF,  p-value: < 2.2e-16

4.1 Predict the values in testing and out of sample data

Below I use the predict function to test the performance of the model in testing data and summarize the performance of the linear regression model. How can you measure the quality of your predictions?

We can assess the performance through 1) the fit of the model obtained through cross validation and 2) through out of sample testing and aiming for RMSE minimisation.

# Out-of-sample performance of the linear model fitted on untransformed prices
lrPredictions <- predict(model1_lm, newdata = test_data)

lr_results <- data.frame(
  RMSE = RMSE(pred = lrPredictions, obs = test_data$price),
  Rsquare = R2(pred = lrPredictions, obs = test_data$price)
)
lr_results
##       RMSE   Rsquare
## 1 204467.4 0.8570818
# Out-of-sample performance of the log-price model. Its predictions are
# exponentiated before being scored against the observed prices.
loggedPredictions <- exp(predict(model1_lm_log, newdata = test_data))

logged_lr_results <- data.frame(
  RMSE = RMSE(pred = loggedPredictions, obs = test_data$price),
  Rsquare = R2(pred = loggedPredictions, obs = test_data$price)
)
logged_lr_results
##       RMSE   Rsquare
## 1 615799.8 0.5978141

As we can see, the log transformed model performs significantly worse, especially in out of sample testing. We therefore decide to stay with untransformed price as a response variable.

# Variable importance of the linear model (scaled by caret)
importance <- varImp(model1_lm, scale = TRUE)
plot(importance)

# Median sale price per district in the training data, sorted by median price,
# for comparison against the importance plot
train_data %>%
  group_by(district) %>%
  summarise(medianPrice = median(price)) %>%
  ggplot(
    mapping = aes(y = fct_reorder(district, medianPrice), x = medianPrice)
  ) +
  geom_col(fill = "steelblue") +
  theme_classic()

Comparing the above plot with the plot on variable importance, we can see that the interaction terms between floor area and district are especially important for districts with high median house prices. This leads us to hypothesise that floor area has particularly high leverage on a house's valuation when the house is located in an upscale area.

5 Fit a tree model

# To mitigate decision trees' shortcomings, a complexity parameter cp and an
# associated cost function penalising a tree's size can be useful. Here an
# array of cp values is tuned and the best value is chosen by cross-validated
# RMSE. As trees risk overfitting as their size increases, this pruning
# measure helps to control variance.
modelLookup(modelName = "rpart")
##   model parameter                label forReg forClass probModel
## 1 rpart        cp Complexity Parameter   TRUE     TRUE      TRUE
# 5-fold cross-validation is used to score every candidate cp value
treeControl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE
)

# Candidate complexity parameters: 0 to 0.001 in steps of 0.00001
treeGrid <- expand.grid(cp = seq(0.0000, 0.001, 0.00001))

# Regression tree tuned over treeGrid.
# tuneLength is deliberately not passed: caret only uses it to build a default
# grid, so it has no effect once an explicit tuneGrid is supplied.
model2_tree <- train(
  price ~ district * total_floor_area + london_zone + number_habitable_rooms +
    co2_emissions_potential + distance_to_station + water_company +
    property_type + latitude + longitude + average_income,
  data = train_data,
  method = "rpart",
  trControl = treeControl,
  tuneGrid = treeGrid
)

# Cross-validated performance across the cp grid
plot(model2_tree)

# Variable importance of the tuned tree (scaled by caret)
treeImportance <- varImp(model2_tree, scale = TRUE)
plot(treeImportance)

The plot of the complexity parameter against the cross-validated RMSE nicely indicates the optimally tuned penalty parameter. It is furthermore interesting to see how the tree's variable importance differs from that of the linear regression, which can possibly be attributed to trees' susceptibility to over-emphasising variables with large outlying values.

# Out-of-sample performance of the regression tree on the testing data
treePredictions <- predict(model2_tree, newdata = test_data)

tree_results <- data.frame(
  RMSE = RMSE(pred = treePredictions, obs = test_data$price),
  Rsquare = R2(pred = treePredictions, obs = test_data$price)
)
tree_results
##       RMSE   Rsquare
## 1 257971.1 0.7706503

One can argue that the poor OOS performance of the Regression Trees is caused by the algorithm’s tendency towards high variance. Applying this notion to the problem of predicting prices, it is likely that the presence of large outlying values in training sets heavily influences predictions to extreme degrees. Thus, this model is not considered viable for further analyses, especially because random forests represent an improvement to simpler regression trees.

6 Other algorithms

Use at least two other algorithms to predict prices. Don’t forget to tune the parameters of these algorithms. And then compare the performances of your algorithms to linear regression and trees.

# List the tunable parameters caret exposes for the radial-kernel SVM
modelLookup(modelName = "svmRadial")
##       model parameter label forReg forClass probModel
## 1 svmRadial     sigma Sigma   TRUE     TRUE      TRUE
## 2 svmRadial         C  Cost   TRUE     TRUE      TRUE
# C and sigma are not tuned over a hand-specified grid. Instead tuneLength
# lets caret generate the candidate values, to get a sense for the outcomes in
# a broader and less constrained way.

# As before, candidates are compared with 5-fold cross-validation
svmControl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE
)

model3_svm <- train(
  price ~ total_floor_area + district + london_zone + number_habitable_rooms +
    co2_emissions_potential + distance_to_station + water_company +
    property_type + latitude + longitude + average_income,
  data = train_data,
  method = "svmRadial",
  trControl = svmControl,
  tuneLength = 10
)

# Variable importance for the SVM (scaled by caret)
svmImportance <- varImp(model3_svm, scale = TRUE)
plot(svmImportance)

# Out-of-sample performance of the support vector regression
svmPredictions <- predict(model3_svm, newdata = test_data)

svm_results <- data.frame(
  RMSE = RMSE(pred = svmPredictions, obs = test_data$price),
  Rsquare = R2(pred = svmPredictions, obs = test_data$price)
)
svm_results
##     RMSE   Rsquare
## 1 179665 0.8936471
# Next model family: random forests. List the tunable parameters caret exposes
# for ranger.
modelLookup(modelName = "ranger")
##    model     parameter                         label forReg forClass probModel
## 1 ranger          mtry #Randomly Selected Predictors   TRUE     TRUE      TRUE
## 2 ranger     splitrule                Splitting Rule   TRUE     TRUE      TRUE
## 3 ranger min.node.size             Minimal Node Size   TRUE     TRUE      TRUE
# 5-fold cross-validation for the random forest
rfControl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE
)

# mtry (number of predictors randomly sampled at each split), splitrule and
# min.node.size are not tuned over a hand-specified grid; no tuneGrid is
# passed, so candidate values are left to caret.
model4_randomForests <- train(
  price ~ total_floor_area + district + london_zone + number_habitable_rooms +
    co2_emissions_potential + distance_to_station + water_company +
    property_type + latitude + longitude + average_income,
  data = train_data,
  na.action = na.omit,
  method = "ranger",
  trControl = rfControl
)
# Out-of-sample performance of the random forest
rfPredictions <- predict(model4_randomForests, newdata = test_data)

rf_results <- data.frame(
  RMSE = RMSE(pred = rfPredictions, obs = test_data$price),
  Rsquare = R2(pred = rfPredictions, obs = test_data$price)
)
rf_results
##       RMSE  Rsquare
## 1 197243.4 0.871355
# As a last algorithm, we fit a gradient boosting model.
#
# Tuning is mostly done to mitigate overfitting/variance, especially by
# limiting n.trees (the number of trees/iterations) and interaction.depth (the
# maximum depth of each tree). Shrinkage regularises the impact of each
# additional tree to limit susceptibility to overfitting.
#
# The n.trees grid starts at 50 rather than 0: a model with zero trees can only
# predict a constant, so it is not a meaningful candidate.
gbmGrid <- expand.grid(
  interaction.depth = c(1, 3, 5),
  n.trees = (1:50) * 50,
  shrinkage = c(0.01, 0.001),
  n.minobsinnode = 10
)

metricToMinimise <- "RMSE"

# Named gbmControl so the variable does not mask caret's trainControl()
gbmControl <- trainControl(method = "cv", number = 5, verboseIter = FALSE)

# property_type is listed once; the repeated term in the formula was redundant
model5_gbm <- train(
  price ~ total_floor_area + district + london_zone + number_habitable_rooms +
    co2_emissions_potential + distance_to_station + water_company +
    property_type + latitude + longitude + average_income,
  data = train_data,
  distribution = "gaussian",
  method = "gbm",
  trControl = gbmControl,
  tuneGrid = gbmGrid,
  metric = metricToMinimise,
  bag.fraction = 0.75
)
# Out-of-sample performance of the gradient boosting model
gbmPredictions <- predict(model5_gbm, newdata = test_data)

gbm_results <- data.frame(
  RMSE = RMSE(pred = gbmPredictions, obs = test_data$price),
  Rsquare = R2(pred = gbmPredictions, obs = test_data$price)
)
gbm_results
##       RMSE   Rsquare
## 1 189858.3 0.8782757
# Side-by-side comparison of the out-of-sample results of all five models;
# the label order must match the row order passed to rbind()
models <- c(
  "Linear Regression",
  "Regression Trees",
  "Support Vector Machines",
  "Random Forests",
  "Gradient Boosting Methods"
)
resultSummary <- cbind(
  models,
  rbind(lr_results, tree_results, svm_results, rf_results, gbm_results)
)
resultSummary
##                      models     RMSE   Rsquare
## 1         Linear Regression 204467.4 0.8570818
## 2          Regression Trees 257971.1 0.7706503
## 3   Support Vector Machines 179665.0 0.8936471
## 4            Random Forests 197243.4 0.8713550
## 5 Gradient Boosting Methods 189858.3 0.8782757

Our constructed Support Vector Machines seem to be most accurate in out of sample testing. We will focus on the last three models for ensembling.

7 Stacking

# 5-fold CV shared by the base learners and the combiner.
# savePredictions = "final" keeps the resampled predictions that caretStack
# needs, so caretList no longer has to override the setting with a warning.
stackingControl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE,
  savePredictions = "final"
)

# First, look at the best tune of each underlying model; these are the
# parameter values reused for stacking below
model3_svm$bestTune
model4_randomForests$bestTune
model5_gbm$bestTune

# Base learners for the ensemble. Each tuneGrid is taken straight from the
# corresponding model's bestTune instead of copying the numbers by hand, so
# the ensemble stays in sync if the individual models are re-tuned.
# property_type is listed once; the repeated term in the formula was redundant.
modelList <- caretList(
  price ~ total_floor_area + district + london_zone + number_habitable_rooms +
    co2_emissions_potential + distance_to_station + water_company +
    property_type + latitude + longitude + average_income,
  train_data,
  trControl = stackingControl,
  tuneList = list(
    svm = caretModelSpec(
      method = "svmRadial",
      tuneGrid = model3_svm$bestTune
    ),
    ranger = caretModelSpec(
      method = "ranger",
      tuneGrid = model4_randomForests$bestTune
    ),
    gbm = caretModelSpec(
      method = "gbm",
      tuneGrid = model5_gbm$bestTune
    )
  )
)

model6_stacking <- caretStack(
  modelList,
  # glm on a numeric target is a linear (gaussian) regression, not a logistic
  # one; it acts as the combiner that weights the underlying models
  method = "glm",
  metric = "RMSE",
  trControl = stackingControl
)

# Out-of-sample performance of the stacked ensemble
stackingPredictions <- predict(model6_stacking, newdata = test_data)

stacking_results <- data.frame(
  RMSE = RMSE(pred = stackingPredictions, obs = test_data$price),
  Rsquare = R2(pred = stackingPredictions, obs = test_data$price)
)
stacking_results
##     RMSE   Rsquare
## 1 179790 0.8910761

As we can see, this one-time out-of-sample test has given us a slightly worse performance than support vector machines alone. However, it is reasonable to assume that, across different tests, stacking will be more consistent and robust because combining models mitigates overfitting.

# Look at the confidence intervals of the cross-validated RMSE of each
# underlying model. The result is stored as stackResamples so the variable
# does not mask caret's resamples() function.
stackResamples <- resamples(modelList)

dotplot(stackResamples, metric = "RMSE")

# Correlation between the base learners' resampled performance
modelCor(stackResamples)
##              svm    ranger       gbm
## svm    1.0000000 0.9493160 0.9007801
## ranger 0.9493160 1.0000000 0.8410876
## gbm    0.9007801 0.8410876 1.0000000

The above plot aids in determining the robustness of our out-of-sample performance result. As we can see, the widths of the confidence intervals are rather similar across the underlying algorithms. This hints at a well-balanced choice of models for the ensemble algorithm. As a next step, let's assess what happens when we include linear regression in the stacking algorithm. Maybe this has the potential to enhance the final result, since it might counteract the tendency of the existing three models to overfit.

# Same ensemble as before with a plain linear regression added as a fourth
# base learner. Tuned parameters are read from each model's bestTune instead of
# being copied by hand. property_type is listed once; the repeated term in the
# formula was redundant.
withLRmodelList <- caretList(
  price ~ total_floor_area + district + london_zone + number_habitable_rooms +
    co2_emissions_potential + distance_to_station + water_company +
    property_type + latitude + longitude + average_income,
  train_data,
  trControl = stackingControl,
  tuneList = list(
    lr = caretModelSpec(method = "lm"),
    svm = caretModelSpec(
      method = "svmRadial",
      tuneGrid = model3_svm$bestTune
    ),
    ranger = caretModelSpec(
      method = "ranger",
      tuneGrid = model4_randomForests$bestTune
    ),
    gbm = caretModelSpec(
      method = "gbm",
      tuneGrid = model5_gbm$bestTune
    )
  )
)
## Warning in trControlCheck(x = trControl, y = target): trControl$savePredictions
## not 'all' or 'final'. Setting to 'final' so we can ensemble the models.
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
# Combine the four base learners with a glm meta-model
withLRmodel7_stacking <- caretStack(
  withLRmodelList,
  method = "glm",
  metric = "RMSE",
  trControl = stackingControl
)

# Out-of-sample performance of the ensemble that includes linear regression
withLRstackingPredictions <- predict(withLRmodel7_stacking, newdata = test_data)

withLRstacking_results <- data.frame(
  RMSE = RMSE(pred = withLRstackingPredictions, obs = test_data$price),
  Rsquare = R2(pred = withLRstackingPredictions, obs = test_data$price)
)
withLRstacking_results
##       RMSE   Rsquare
## 1 181850.1 0.8888764

As we can see, the introduction of linear regression to our stacking ensemble actually reduces the out-of-sample performance slightly. Let's not consider it for our final model.

8 Pick investments

We want to maximise the average percentage return/profit on the houses we invest in. This means that we have to predict actual prices/valuations as accurately as possible. We thus choose the stacking model (excluding linear regression): its out-of-sample performance was nearly identical to that of the best single model (support vector machines), and we expect it to be more robust across different samples.

# Number of properties to invest in
numChoose <- 200

oos <- london_house_prices_2019_out_of_sample

# Predict the value of the houses with the chosen stacking model
oos$predict <- predict(model6_stacking, oos)

# Choose the investments: rank the houses by predicted percentage return and
# flag the top numChoose.
# NOTE(review): the flag column is named `buy` here -- confirm the column name
# expected by the submission template.
mutOos <- oos %>%
  mutate(
    absProfit = predict - asking_price,
    roi = (predict - asking_price) / asking_price,
    # row_number() breaks ties, so the cut-off selects exactly numChoose rows;
    # rows with a missing roi are never selected
    buy = as.integer(!is.na(roi) & row_number(desc(roi)) <= numChoose)
  )

# Guard against silently submitting the wrong number of investments
if (sum(mutOos$buy) != numChoose) {
  warning(
    "Expected ", numChoose, " selected properties but flagged ",
    sum(mutOos$buy), "."
  )
}

# Output your choices. Change the name of the file to your
# "lastname_firstname.csv"
write.csv(mutOos, "my_submission.csv")